import urllib2
from BeautifulSoup import BeautifulSoup

#startyear = 1999
#startmonth = 9
#startmsg = 0

#page = urllib2.urlopen("http://biowww.dfci.harvard.edu/~hellas/2006/03.2006/msg00000.html")
#soup = BeautifulSoup(page)
##print soup
##print soup.findAll('pre')
#for content in soup.find('pre'):
#    print content.strip()
import codecs
if __name__ == '__main__':
    # Scrape the archived messages for every month of 2006 and append the
    # <pre> element of each message page to a single UTF-8 text file.
    #
    # Within each month, message numbers are probed sequentially from 0.
    # A probe counts as "bad" when the page has no <pre> element or the
    # request raises urllib2.URLError. The count is cumulative for the month
    # (a successful probe does not reset it); once it reaches 5 the script
    # moves on to the next month.
    out = codecs.open('hellas06-07.txt', mode = 'wb', encoding='utf-8')
    try:
        for year in range(2006, 2007):
            for month in range(1, 12 + 1):
                bad_count = 0
                msg = 0

                while bad_count < 5:
                    # Progress trace: "<year> <month> <msg>".
                    print('%d %d %d' % (year, month, msg))
                    # Built outside the try so the error handler can always
                    # report the URL that was attempted.
                    url = 'http://biowww.dfci.harvard.edu/~hellas/%d/%02d.%d/msg%05d.html'%(year,month,year,msg)
                    try:
                        page = urllib2.urlopen(url)
                        try:
                            # Read the whole body up front so the connection
                            # can be released before parsing.
                            html = page.read()
                        finally:
                            page.close()
                        soup = BeautifulSoup(html)
                        content = soup.find('pre')
                        if content:
                            out.write(unicode(content))
                        else:
                            bad_count += 1
                    except urllib2.URLError:
                        bad_count += 1
                        print('Bad url %s' % url)
                    msg += 1
    finally:
        # Always flush and close the output, even if the run is interrupted
        # or an unexpected exception escapes the loops.
        out.close()
                    
            
                
            


    

